# COVID-19 dataset analysis.
# Coronavirus disease (COVID-19) is caused by Severe Acute Respiratory
# Syndrome Coronavirus 2 (SARS-CoV-2) and has had a worldwide effect.
# On March 11, 2020, the World Health Organization (WHO) declared it a
# pandemic, pointing to the over 118,000 cases of the illness in over
# 110 countries and territories around the world at the time.
# lets load the dataset which provided by WHO
import pandas as pd
import numpy as np
# Load the WHO COVID-19 dataset; parse_dates converts the 'Date'
# column from object dtype to datetime64 at read time.
covid_data = pd.read_csv('covid_data.csv', parse_dates=['Date'])
covid_data
# Active cases = confirmed minus those who have died or recovered.
covid_data['Active_cases'] = (
    covid_data['Confirmed'] - covid_data['Deaths'] - covid_data['Recovered']
)
covid_data
# Confirm that 'Date' now reports a datetime dtype.
covid_data.dtypes
import seaborn as sns
import matplotlib.pyplot as plt
# Visualise missing values: a uniform heatmap means no nulls, so the
# dataset can be used as-is for further processing.
sns.heatmap(covid_data.isnull())
plt.show()
covid_data.isnull().sum()
covid_data.describe()
# Observations from describe():
# - the large min/max gap implies a high standard deviation;
# - the large mean/median gap suggests outliers are present.
# Build a calendar-oriented view of the counts so we can aggregate by
# month name and day-of-week name.
covid_data_dates = pd.DataFrame()
covid_data_dates['month'] = covid_data['Date'].dt.month_name()
covid_data_dates['day'] = covid_data['Date'].dt.day_name()
covid_data_dates['year'] = covid_data['Date'].dt.year
covid_data_dates['deaths'] = covid_data['Deaths']
covid_data_dates['recovered'] = covid_data['Recovered']
covid_data_dates['confirmed'] = covid_data['Confirmed']
covid_data_dates['active_cases'] = covid_data['Active_cases']
covid_data_dates
# Drop the 'year' column — the author notes it is repeating and adds no
# information.  BUG FIX: the original called
# covid_data_dates['year'].drop_duplicates(inplace=True), which
# deduplicates a detached Series and never removes the column from the
# DataFrame; DataFrame.drop(columns=...) actually drops it.
covid_data_dates.drop(columns='year', inplace=True)
# Let's dig deeper into the dataset.
# Worldwide death counts aggregated by calendar month (transposed to a
# one-row view).
monthly_deaths = covid_data_dates.groupby('month')['deaths'].sum().to_frame().T
monthly_deaths
# Worldwide death counts aggregated by day of the week.
daily_deaths = covid_data_dates.groupby('day')['deaths'].sum().to_frame().T
daily_deaths
# Bar plot of deaths per month.
print('Worldwide Deaths happens each Months and ','\n','The Maximum number of death occors in the month of May which is = 7746343')
sns.barplot(x='month',y='deaths',data=covid_data_dates)
plt.show()
print('\n')
print('Worldwide Average Deaths happens each day')
# Bar plot of deaths per weekday.
sns.barplot(x='day',y='deaths',data=covid_data_dates)
plt.show()
# Line plot: the death pattern rises rapidly after the month of March.
sns.lineplot(x='month',y='deaths',data=covid_data_dates)
plt.show()
# Worldwide recovered counts aggregated by calendar month.
monthly_recovered = covid_data_dates.groupby('month')['recovered'].sum().to_frame().T
monthly_recovered
# Bar plot of recoveries per month.
print('Worldwide recovered cases observed each Months and ','\n','The Maximum number of recovered cases observed in the month of May which is = 41747602')
sns.barplot(x='month',y='recovered',data=covid_data_dates)
plt.show()
print('\n')
# Line plot: recoveries grow every month, faster than deaths.
print('we can observe the recovered cases increasing every month, the Recovery rate is faster than Death cases each month')
sns.lineplot(x='month',y='recovered',data=covid_data_dates)
plt.show()
# Recovered counts aggregated by day of the week.
daily_recovered = covid_data_dates.groupby('day')['recovered'].sum().to_frame().T
daily_recovered
print('Worldwide Average recovered cases observed each day')
# Bar plot of recoveries per weekday.
sns.barplot(x='day',y='recovered',data=covid_data_dates)
plt.show()
# Worldwide confirmed counts aggregated by calendar month.
monthly_confirmed = covid_data_dates.groupby('month')['confirmed'].sum().to_frame().T
monthly_confirmed
# Bar plot of confirmed cases per month.
print('Worldwide Confirmed cases observed each Months and ','\n','The Maximum number of confirmed cases observed in the month of May which is = 115121451')
sns.barplot(x='month',y='confirmed',data=covid_data_dates)
plt.show()
print('\n')
# Line plot: confirmed cases grow every month.
print('we can observe the confirmed cases increasing every month')
sns.lineplot(x='month',y='confirmed',data=covid_data_dates)
plt.show()
# Confirmed counts aggregated by day of the week.
daily_confirmed = covid_data_dates.groupby('day')['confirmed'].sum().to_frame().T
daily_confirmed
print('Worldwide Average confirmed cases observed each day')
# Bar plot of confirmed cases per weekday.
sns.barplot(x='day',y='confirmed',data=covid_data_dates)
plt.show()
# Worldwide active counts aggregated by calendar month.
monthly_active = covid_data_dates.groupby('month')['active_cases'].sum().to_frame().T
monthly_active
# Bar plot of active cases per month.
print('Worldwide active cases observed each Months and ','\n','The Maximum number of active cases observed in the month of May which is = 65627506','\n','we can also notice the least case at starting of covid which is 36801 in january')
sns.barplot(x='month',y='active_cases',data=covid_data_dates)
plt.show()
print('\n')
# Line plot: active cases grow every month.
print('we can observe the active cases increasing every month ')
sns.lineplot(x='month',y='active_cases',data=covid_data_dates)
plt.show()
# Active counts aggregated by day of the week.
daily_active = covid_data_dates.groupby('day')['active_cases'].sum().to_frame().T
daily_active
print('Worldwide Average active cases observed each day')
# Bar plot of active cases per weekday.
sns.barplot(x='day',y='active_cases',data=covid_data_dates)
plt.show()
# Aggregate all four metrics per month in one table.
# BUG FIX: selecting multiple columns after a groupby requires a list;
# the original passed a bare tuple ('confirmed','active_cases',...),
# which pandas deprecated and removed in 2.x (raises ValueError).
All_cases_month = pd.DataFrame(
    covid_data_dates.groupby('month')[
        ['confirmed', 'active_cases', 'recovered', 'deaths']
    ].sum()
).T
All_cases_month
All_cases_month.plot(kind='bar', title='ALL_Cases_Everymonth', figsize=(12, 6))
# Narrow view of the four raw metrics plus their worldwide totals.
cases = covid_data[['Confirmed', 'Active_cases', 'Recovered', 'Deaths']]
cases.sum().T
# Correlation between the four metrics.
sns.heatmap(cases.corr(), annot=True, cmap='Accent')
# lets seperate confirmed cases to a seperate column according to the countries
confirmedcases = pd.DataFrame(covid_data.groupby('Country')['Confirmed'].sum())
confirmedcases['Country'] = confirmedcases.index #lets set index so that we can plot and use it further
confirmedcases.index = np.arange(1,189) # becuase we have 188 column.
world_confirmedcases = confirmedcases[['Country','Confirmed']]
# lets seperate active cases to a seperate column according to the countries
activecases = pd.DataFrame(covid_data.groupby('Country')['Active_cases'].sum())
activecases['Country'] = activecases.index
activecases.index = np.arange(1,189) # (1,189) becuase we have 188 column.
world_activecases = activecases[['Country','Active_cases']]
world_activecases
# lets seperate recovered cases to a seperate column according to the countries
recoveredcases = pd.DataFrame(covid_data.groupby('Country')['Recovered'].sum())
recoveredcases['Country'] = recoveredcases.index
recoveredcases.index = np.arange(1,189) # becuase we have 188 column.
world_recoveredcases = recoveredcases[['Country','Recovered']]
# lets seperate death cases to a seperate column according to the countries
deathcases = pd.DataFrame(covid_data.groupby('Country')['Deaths'].sum())
deathcases['Country'] = deathcases.index
deathcases.index = np.arange(1,189) # becuase we have 188 column.
world_deathcases = deathcases[['Country','Deaths']]
world_deathcases.to_excel('m.xlsx')
import plotly.express as px
# Horizontal bar charts of the 20 hardest-hit countries per metric.
# Sorting descending, slicing the top 20, then reversing puts the
# largest bar at the top of each horizontal chart.
for frame, col, title in (
    (world_confirmedcases, 'Confirmed', 'Confirmed Cases Worldwide'),
    (world_activecases, 'Active_cases', 'Active Cases Worldwide'),
    (world_recoveredcases, 'Recovered', 'Recovered Cases Worldwide'),
    (world_deathcases, 'Deaths', 'Death Cases Worldwide'),
):
    top20 = frame.sort_values(col, ascending=False)[:20][::-1]
    fig = px.bar(top20, x=col, y='Country', title=title, text=col,
                 height=800, orientation='h')
    fig.show()
# Pairwise scatter plots: every metric grows together over time.
sns.pairplot(covid_data)
# Outlier removal before modelling: keep only the rows whose absolute
# z-score is below 3 on every metric.
from scipy.stats import zscore
z_score = np.abs(zscore(cases))
print(cases.shape)
new_data = cases[(z_score < 3).all(axis=1)]
# The shrunken shape confirms outliers were present and removed.
print(new_data.shape)
# we can see there were outliers present in the datasets
df_x=new_data.drop(['Deaths'],axis=1)
y=pd.DataFrame(new_data['Deaths'])
# scaling the input variable
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x=sc.fit_transform(df_x)
x=pd.DataFrame(x,columns=df_x.columns)
# Apply regression to the dataset, with Deaths as the target.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def maxr2_score(regr, x, y):
    """Return the train/test-split random state (42..99) that yields the
    best hold-out r2 for *regr*.

    For each candidate random state, the data is split 80/20, the model
    is fit, and the test-set r2 is recorded; the state with the highest
    r2 is printed and returned.  Defined as a function so every model
    below can reuse it.

    BUG FIX: the original body had lost its indentation (the script
    could not run), and `final_r_state` was only assigned inside the
    `if`, raising UnboundLocalError whenever no split beat an r2 of 0;
    it now falls back to the first candidate state.
    """
    max_r_score = 0
    final_r_state = 42  # fallback if no split ever improves on r2 = 0
    for r_state in range(42, 100):
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, random_state=r_state, test_size=0.20)
        regr.fit(x_train, y_train)
        y_pred = regr.predict(x_test)
        r2_scr = r2_score(y_test, y_pred)
        if r2_scr > max_r_score:
            max_r_score = r2_scr
            final_r_state = r_state
    print()
    print('max r2 score correponding to', final_r_state, 'is', max_r_score)
    return final_r_state
# Baseline model: plain linear regression on the scaled inputs.
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()
r_state = maxr2_score(lreg, x, y)
# Cross-validate to see whether the hold-out score generalises.
from sklearn.model_selection import cross_val_score
a_score = cross_val_score(lreg, x, y, cv=5, scoring='r2').mean()
print('cross val score', a_score)
# The CV mean falls well below the best hold-out r2: the model is
# overfitting, so other models are tried next.
# KNN regression: grid-search the neighbour count, then score the
# chosen setting the same way as the baseline.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=r_state, test_size=0.20)
gknr = GridSearchCV(KNeighborsRegressor(), {'n_neighbors': range(1, 30)}, cv=10)
gknr.fit(x_train, y_train)
gknr.best_params_
knr = KNeighborsRegressor(n_neighbors=3)
r_state = maxr2_score(knr, x, y)
# cv=5 KFold without shuffling is deterministic, so one scoring run
# serves both the mean and the standard deviation.
knn_scores = cross_val_score(knr, x, y, cv=5, scoring='r2')
print('mean cross val score for KNN regression:', knn_scores.mean())
print('standard deviation in r2 score for KNN Regression', knn_scores.std())
# Gradient boosting regression with a grid search over learning rate
# and ensemble size.
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')
gbr = GradientBoostingRegressor()
# BUG FIX: the original grid listed 100 twice ([10,100,500,100]); the
# duplicate is replaced with 1000 so the search actually covers a
# fourth ensemble size.
parameters = {'learning_rate': [0.001, 0.01, 0.1, 1],
              'n_estimators': [10, 100, 500, 1000]}
clf = GridSearchCV(gbr, parameters, cv=5)
clf.fit(x_train, y_train)
clf.best_params_
# Refit with the parameters chosen above and score as before.
gbr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=500)
r_state = maxr2_score(gbr, x, y)
gbr_scores = cross_val_score(gbr, x, y, cv=5, scoring='r2')
print('mean cross val score for GBR regression:', gbr_scores.mean())
print('standard deviation in r2 score for GBR Regression', gbr_scores.std())
# Linear regression gave the best r2 and cross-validation score, so it
# is chosen as the final model.
# NOTE(review): the final fit uses the unscaled df_x while the earlier
# comparisons used the scaled x — confirm this is intended (OLS
# predictions are unaffected by affine feature scaling).
x_train, x_test, y_train, y_test = train_test_split(
    df_x, y, random_state=42, test_size=0.20)
lreg = LinearRegression()
lreg.fit(x_train, y_train)
y_pred = lreg.predict(x_test)
# Report CV score, RMSE and hold-out r2 for the final model.
from sklearn.metrics import r2_score, mean_squared_error
# BUG FIX: the label said 'GBR regression' but this scores the final
# linear regression model.
print('mean cross val score for linear regression:',
      cross_val_score(lreg, df_x, y, cv=5, scoring='r2').mean())
print('RMSE', np.sqrt(mean_squared_error(y_test, y_pred)))
print('r2_score is: ', r2_score(y_test, y_pred))
import joblib
# Persist the trained model for later reuse.
joblib.dump(lreg, 'covid_data_linear_regr.pkl')